#!/usr/bin/perl
#
# tcpretrans - show TCP retransmts, with address and other details.
#              Written using Linux ftrace.
#
# This traces TCP retransmits, showing address, port, and TCP state information,
# and sometimes the PID (although usually not, since retransmits are usually
# sent by the kernel on timeouts). To keep overhead low, only
# tcp_retransmit_skb() calls are traced (this does not trace every packet).
#
# USAGE: ./tcpretrans [-hls]
#
# REQUIREMENTS: FTRACE and KPROBE CONFIG, tcp_retransmit_skb() kernel function,
# and tcp_send_loss_probe() when -l is used. You may have these already have
# these on recent kernels. And Perl.
#
# This was written as a proof of concept for ftrace, for older Linux systems,
# and without kernel debuginfo. It uses dynamic tracing of tcp_retransmit_skb(),
# and reads /proc/net/tcp for socket details. Its use of dynamic tracing and
# CPU registers is an unstable platform-specific workaround, and may require
# modifications to work on different kernels and platforms. This would be better
# written using a tracer such as SystemTap, and will likely be rewritten in the
# future when certain tracing features are added to the Linux kernel.
#
# When -l is used, this also uses dynamic tracing of tcp_send_loss_probe() and
# a register.
#
# Currently only IPv4 is supported, on x86_64. If you try this on a different
# architecture, you'll likely need to adjust the register locations (search
# for %di).
#
# OVERHEAD: The CPU overhead is relative to the rate of TCP retransmits, and is
# designed to be low as this does not examine every packet. Once per second the
# /proc/net/tcp file is read, and a buffer of retransmit trace events is
# retrieved from the kernel and processed.
#
# From perf-tools: https://github.com/brendangregg/perf-tools
#
# See the tcpretrans(8) man page (in perf-tools) for more info.
#
# COPYRIGHT: Copyright (c) 2014 Brendan Gregg.
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software Foundation,
#  Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#  (http://www.gnu.org/copyleft/gpl.html)
#
# 28-Jul-2014	Brendan Gregg	Created this.

use strict;
use warnings;
use POSIX qw(strftime);
use Getopt::Long;
my $tracing = "/sys/kernel/debug/tracing";
my $flock = "/var/tmp/.ftrace-lock";
my $interval = 1;
local $SIG{INT} = \&cleanup;
local $SIG{QUIT} = \&cleanup;
local $SIG{TERM} = \&cleanup;
local $SIG{PIPE} = \&cleanup;
local $SIG{HUP} = \&cleanup;
$| = 1;

### options
my ($help, $stacks, $tlp);
GetOptions("help|h"   => \$help,
	   "stacks|s" => \$stacks,
	   "tlp|l" => \$tlp)
or usage();
usage() if $help;

sub usage {
	print STDERR "USAGE: tcpretrans [-hls]\n";
	print STDERR "                  -h      # help message\n";
	print STDERR "                  -l      # trace TCP tail loss probes\n";
	print STDERR "                  -s      # print stack traces\n";
	print STDERR "   eg,\n";
	print STDERR "       tcpretrans         # trace TCP retransmits\n";
	exit;
}

# delete lock and die
sub ldie {
	unlink $flock;
	die @_;
}

# end tracing (silently) and die
sub edie {
	print STDERR "@_\n";
	close STDOUT;
	close STDERR;
	cleanup();
}

sub writeto {
	my ($string, $file) = @_;
	open FILE, ">$file" or return 0;
	print FILE $string or return 0;
	close FILE or return 0;
}

sub appendto {
	my ($string, $file) = @_;
	open FILE, ">>$file" or return 0;
	print FILE $string or return 0;
	close FILE or return 0;
}

# kprobe functions
sub create_kprobe {
	my ($kname, $kval) = @_;
	appendto "p:$kname $kval", "kprobe_events" or return 0;
}

sub enable_kprobe {
	my ($kname) = @_;
	writeto "1", "events/kprobes/$kname/enable" or return 0;
}

sub remove_kprobe {
	my ($kname) = @_;
	writeto "0", "events/kprobes/$kname/enable" or return 0;
	appendto "-:$kname", "kprobe_events" or return 0;
}

# tcp socket cache
my %tcp;
sub cache_tcp {
	undef %tcp;
	open(TCP, "/proc/net/tcp") or ldie "ERROR: reading /proc/net/tcp.";
	while (<TCP>) {
		next if /^ *sl/;
		my ($sl, $local_address, $rem_address, $st, $tx_rx, $tr_tm,
		    $retrnsmt, $uid, $timeout, $inode, $jf, $sk) = split;
		$sk =~ s/^0x//;
		$tcp{$sk}{laddr} = $local_address;
		$tcp{$sk}{raddr} = $rem_address;
		$tcp{$sk}{state} = $st;
	}
	close TCP;
}

my @tcpstate;
sub map_tcp_states {
	push @tcpstate, "NULL";
	for (<DATA>) {
		chomp;
		s/.*TCP_//;
		s/[, ].*$//;
		push @tcpstate, $_;
	}
}

# /proc/net/tcp hex addr to dotted quad decimal
sub inet_h2a {
	my ($haddr) = @_;

	my @addr = ();
	for my $num ($haddr =~ /(..)(..)(..)(..)/) {
		unshift @addr, hex($num);
	}
	return join(".", @addr);
}

### check permissions
chdir "$tracing" or die "ERROR: accessing tracing. Root? Kernel has FTRACE?" .
    "\ndebugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)";

### ftrace lock
if (-e $flock) {
	open FLOCK, $flock; my $fpid = <FLOCK>; chomp $fpid; close FLOCK;
	die "ERROR: ftrace may be in use by PID $fpid ($flock)";
}
writeto "$$", $flock or die "ERROR: unable to write $flock.";

#
# Setup and begin tracing.
# Use of ldie() and edie() ensures that if an error is encountered, the
# kernel is not left in a partially configured state.
#
writeto "nop", "current_tracer" or ldie "ERROR: disabling current_tracer.";
my $kname_rtr = "tcpretrans_tcp_retransmit_skb";
my $kname_tlp = "tcpretrans_tcp_send_loss_probe";
create_kprobe $kname_rtr, "tcp_retransmit_skb sk=%di" or
    ldie "ERROR: creating kprobe for tcp_retransmit_skb().";;
if ($tlp) {
	create_kprobe $kname_tlp, "tcp_send_loss_probe sk=%di" or
	    edie "ERROR: creating kprobe for tcp_send_loss_probe(). " .
	         "Older kernel version?";
}
if ($stacks) {
	writeto "1", "options/stacktrace" or print STDERR "WARNING: " .
	    "unable to enable stacktraces.\n";
}
enable_kprobe $kname_rtr or edie "ERROR: enabling $kname_rtr probe.";
if ($tlp) {
	enable_kprobe $kname_tlp or edie "ERROR: enabling $kname_tlp probe.";
}
map_tcp_states();
printf "%-8s %-6s %-20s -- %-20s %-12s\n", "TIME", "PID", "LADDR:LPORT",
    "RADDR:RPORT", "STATE";

#
# Read and print event data. This loop waits one second then reads the buffered
# trace data, then caches /proc/net/tcp, then iterates over the buffered trace
# data using the cached state. While this minimizes CPU overheads, it only
# works because sockets that are retransmitting are usually long lived, and
# remain in /proc/net/tcp for at least our sleep interval.
#
while (1) {
	sleep $interval;

	# buffer trace data
	open TPIPE, "trace" or edie "ERROR: opening trace_pipe.";
	my @trace = ();
	while (<TPIPE>) {
		next if /^#/;
		push @trace, $_;
	}
	close TPIPE;
	writeto "0", "trace" or edie "ERROR: clearing trace";

	# cache /proc/net/tcp state
	if (scalar @trace) {
		cache_tcp();
	}

	# process and print events
	for (@trace) {
		if ($stacks && /^ *=>/) {
			print $_;
			next;
		}

		my ($taskpid, $rest) = split ' ', $_, 2;
		my ($task, $pid) = $taskpid =~ /(.*)-(\d+)/;

		my ($skp) = $rest =~ /sk=([0-9a-fx]*)/;
		next unless defined $skp and $skp ne "";
		$skp =~ s/^0x//;

		my ($laddr, $lport, $raddr, $rport, $state);
		if (defined $tcp{$skp}) {
			# convert /proc/net/tcp hex to dotted quads
			my ($hladdr, $hlport) = split /:/, $tcp{$skp}{laddr};
			my ($hraddr, $hrport) = split /:/, $tcp{$skp}{raddr};
			$laddr = inet_h2a($hladdr);
			$raddr = inet_h2a($hraddr);
			$lport = hex($hlport);
			$rport = hex($hrport);
			$state = $tcpstate[hex($tcp{$skp}{state})];
		} else {
			# socket closed too quickly
			($laddr, $raddr) = ("-", "-");
			($lport, $rport) = ("-", "-");
			$state = "-";
		}

		my $now = strftime "%H:%M:%S", localtime;
		printf "%-8s %-6s %-20s %s> %-20s %-12s\n", $now, $pid,
		    "$laddr:$lport", $rest =~ /$kname_tlp/ ? "L" : "R",
		    "$raddr:$rport", $state,
	}
}

### end tracing
cleanup();

sub cleanup {
	print "\nEnding tracing...\n";
	close TPIPE;
	if ($stacks) {
		writeto "0", "options/stacktrace" or print STDERR "WARNING: " .
		    "unable to disable stacktraces.\n";
	}
	remove_kprobe $kname_rtr
	    or print STDERR "ERROR: removing kprobe $kname_rtr\n";
	if ($tlp) {
		remove_kprobe $kname_tlp
		    or print STDERR "ERROR: removing kprobe $kname_tlp\n";
	}
	writeto "", "trace";
	unlink $flock;
	exit;
}

# from /usr/include/netinet/tcp.h:
__DATA__
  TCP_ESTABLISHED = 1,
  TCP_SYN_SENT,
  TCP_SYN_RECV,
  TCP_FIN_WAIT1,
  TCP_FIN_WAIT2,
  TCP_TIME_WAIT,
  TCP_CLOSE,
  TCP_CLOSE_WAIT,
  TCP_LAST_ACK,
  TCP_LISTEN,
  TCP_CLOSING   /* now a valid state */
